This section will contain the initial models to better understand the various time series datasets included in this project. This involves determining how to properly augment the data, selecting model parameters, fitting models, diagnosing them, and obtaining forecasts to contextualize results.
Note: Much of the code and process on this page is re-purposed from:
Gamage, P. (2026). Applied Time Series for Data Science.
Prior to fitting any time series modeling to these datasets, we must determine whether they are stationary to decide if data augmentation needs to be applied. Some of these steps were executed in the Exporatory Data Analysis section, so this will be a summation of what drives our decisions.
The following object is masked from 'package:dplyr':
combine
Code
ridership_ts <-ts(ridership$`Total Ridership (000s)`, start=c(1992,1), frequency =4)ridership_acf <-ggAcf(ridership_ts)+ggtitle("ACF Plot for Public Transit Ridership") +theme_bw() +geom_segment(lineend ="butt", color ="#68A2B9") +geom_hline(yintercept =0, color ="#68A2B9") ridership_pacf <-ggPacf(ridership_ts)+ggtitle("PACF Plot for Public Transit Ridership") +theme_bw()+geom_segment(lineend ="butt", color ="#68A2B9") +geom_hline(yintercept =0, color ="#68A2B9") grid.arrange(ridership_acf, ridership_pacf, nrow=2)
Code
tseries::adf.test(na.remove(ridership_ts))
Augmented Dickey-Fuller Test
data: na.remove(ridership_ts)
Dickey-Fuller = -1.8774, Lag order = 5, p-value = 0.6275
alternative hypothesis: stationary
Code
mta_ts <-ts(mta$total_ridership, start=c(2020,3,1), frequency =365.25)mta_acf <-ggAcf(mta_ts, lag.max =30)+ggtitle("ACF Plot for MTA (New York City) Ridership") +theme_bw() +geom_segment(lineend ="butt", color ="#68A2B9") +geom_hline(yintercept =0, color ="#68A2B9") mta_pacf <-ggPacf(mta_ts, lag.max =30)+ggtitle("PACF Plot for MTA (New York City) Ridership") +theme_bw()+geom_segment(lineend ="butt", color ="#68A2B9") +geom_hline(yintercept =0, color ="#68A2B9") grid.arrange(mta_acf, mta_pacf, nrow=2)
Code
tseries::adf.test(na.remove(mta_ts))
Warning in tseries::adf.test(na.remove(mta_ts)): p-value smaller than printed
p-value
Augmented Dickey-Fuller Test
data: na.remove(mta_ts)
Dickey-Fuller = -6.554, Lag order = 12, p-value = 0.01
alternative hypothesis: stationary
Code
cta_ts <-ts(cta$total_rides, start=c(2015,1,1), frequency =365.25)cta_acf <-ggAcf(cta_ts, lag.max =30)+ggtitle("ACF Plot for CTA (Chicago) Ridership") +theme_bw() +geom_segment(lineend ="butt", color ="#68A2B9") +geom_hline(yintercept =0, color ="#68A2B9") cta_pacf <-ggPacf(cta_ts, lag.max =30)+ggtitle("PACF Plot for CTA (Chicago) Ridership") +theme_bw()+geom_segment(lineend ="butt", color ="#68A2B9") +geom_hline(yintercept =0, color ="#68A2B9") grid.arrange(cta_acf, cta_pacf, nrow=2)
Code
tseries::adf.test(na.remove(cta_ts))
Augmented Dickey-Fuller Test
data: na.remove(cta_ts)
Dickey-Fuller = -3.8003, Lag order = 15, p-value = 0.01895
alternative hypothesis: stationary
Code
cars_ts <-ts(cars$M12MTVUSM227NFWA, start=c(1993,1), frequency =12)cars_acf <-ggAcf(cars_ts, lag.max =30)+ggtitle("ACF Plot for Vehicle Miles") +theme_bw() +geom_segment(lineend ="butt", color ="#68A2B9") +geom_hline(yintercept =0, color ="#68A2B9") cars_pacf <-ggPacf(cars_ts, lag.max =30)+ggtitle("PACF Plot for Vehicle Miles") +theme_bw()+geom_segment(lineend ="butt", color ="#68A2B9") +geom_hline(yintercept =0, color ="#68A2B9") grid.arrange(cars_acf, cars_pacf, nrow=2)
Code
tseries::adf.test(na.remove(cars_ts))
Augmented Dickey-Fuller Test
data: na.remove(cars_ts)
Dickey-Fuller = -2.7693, Lag order = 7, p-value = 0.2523
alternative hypothesis: stationary
Code
gas_ts <-ts(gas$`Price per Gallon`, start=c(1993,1), frequency =12)gas_acf <-ggAcf(gas_ts, lag.max =30)+ggtitle("ACF Plot for Gas Prices") +theme_bw() +geom_segment(lineend ="butt", color ="#68A2B9") +geom_hline(yintercept =0, color ="#68A2B9") gas_pacf <-ggPacf(gas_ts, lag.max =30)+ggtitle("PACF Plot for Gas Prices") +theme_bw()+geom_segment(lineend ="butt", color ="#68A2B9") +geom_hline(yintercept =0, color ="#68A2B9") grid.arrange(gas_acf, gas_pacf, nrow=2)
Code
tseries::adf.test(na.remove(gas_ts))
Augmented Dickey-Fuller Test
data: na.remove(gas_ts)
Dickey-Fuller = -2.484, Lag order = 7, p-value = 0.3727
alternative hypothesis: stationary
Code
unemp_ts <-ts(unemp$UNRATE, start=c(1993,1), frequency =12)unemp_acf <-ggAcf(unemp_ts, lag.max =30)+ggtitle("ACF Plot for Unemployment Rate") +theme_bw() +geom_segment(lineend ="butt", color ="#68A2B9") +geom_hline(yintercept =0, color ="#68A2B9") unemp_pacf <-ggPacf(unemp_ts, lag.max =30)+ggtitle("PACF Plot for Unemployment Rate") +theme_bw()+geom_segment(lineend ="butt", color ="#68A2B9") +geom_hline(yintercept =0, color ="#68A2B9") grid.arrange(unemp_acf, unemp_pacf, nrow=2)
Code
tseries::adf.test(na.remove(unemp_ts))
Augmented Dickey-Fuller Test
data: na.remove(unemp_ts)
Dickey-Fuller = -2.3338, Lag order = 7, p-value = 0.4361
alternative hypothesis: stationary
Uber
Code
uber_ts <-ts(uber$UBER.Close, start=c(2020,1,1), frequency =252)uber_acf <-ggAcf(uber_ts, lag.max =30)+ggtitle("ACF Plot for Uber Stock Price") +theme_bw() +geom_segment(lineend ="butt", color ="#68A2B9") +geom_hline(yintercept =0, color ="#68A2B9") uber_pacf <-ggPacf(uber_ts)+ggtitle("PACF Plot for Uber Stock Price") +theme_bw()+geom_segment(lineend ="butt", color ="#68A2B9") +geom_hline(yintercept =0, color ="#68A2B9") grid.arrange(uber_acf, uber_pacf, nrow=2)
Code
tseries::adf.test(na.remove(uber_ts))
Augmented Dickey-Fuller Test
data: na.remove(uber_ts)
Dickey-Fuller = -1.5743, Lag order = 11, p-value = 0.7585
alternative hypothesis: stationary
Zoom
Code
zoom_ts <-ts(zoom$ZM.Close, start=c(2020,1,1), frequency =252)zoom_acf <-ggAcf(zoom_ts, lag.max =30)+ggtitle("ACF Plot for Zoom Stock Price") +theme_bw() +geom_segment(lineend ="butt", color ="#68A2B9") +geom_hline(yintercept =0, color ="#68A2B9") zoom_pacf <-ggPacf(zoom_ts, lag.max =30)+ggtitle("PACF Plot for Zoom Stock Price") +theme_bw()+geom_segment(lineend ="butt", color ="#68A2B9") +geom_hline(yintercept =0, color ="#68A2B9") grid.arrange(zoom_acf, zoom_pacf, nrow=2)
Code
tseries::adf.test(na.remove(zoom_ts))
Augmented Dickey-Fuller Test
data: na.remove(zoom_ts)
Dickey-Fuller = -2.5477, Lag order = 11, p-value = 0.3465
alternative hypothesis: stationary
Exxon
Code
exxon_ts <-ts(exxon$XOM.Close, start=c(2020,1,1), frequency =252)exxon_acf <-ggAcf(exxon_ts, lag.max =30)+ggtitle("ACF Plot for Exxon Stock Price") +theme_bw() +geom_segment(lineend ="butt", color ="#68A2B9") +geom_hline(yintercept =0, color ="#68A2B9") exxon_pacf <-ggPacf(exxon_ts, lag.max =30)+ggtitle("PACF Plot for Exxon Stock Price") +theme_bw()+geom_segment(lineend ="butt", color ="#68A2B9") +geom_hline(yintercept =0, color ="#68A2B9") grid.arrange(exxon_acf, exxon_pacf, nrow=2)
Code
tseries::adf.test(na.remove(exxon_ts))
Augmented Dickey-Fuller Test
data: na.remove(exxon_ts)
Dickey-Fuller = -2.5443, Lag order = 11, p-value = 0.3479
alternative hypothesis: stationary
In all of these cases, the time series data are not stationary. This is evident due to the large values in the ACF plots, despite some ADF tests suggesting otherwise. Thus, we are likely going to need to difference the data in all cases. The following section compares differenced data to the original.
fit_ridership <-lm(na.remove(ridership_ts)~time(na.remove(ridership_ts)), na.action=NULL)plot1 <-ggAcf(ridership_ts, 48, main="Original Data") +theme_bw()+geom_segment(lineend ="butt", color ="#68A2B9") +geom_hline(yintercept =0, color ="#68A2B9")
Warning in ggplot2::geom_segment(lineend = "butt", ...): Ignoring unknown
parameters: `main`
Code
plot2 <-ggAcf(resid(fit_ridership), 48, main="Detrended Data") +theme_bw()+geom_segment(lineend ="butt", color ="#68A2B9") +geom_hline(yintercept =0, color ="#68A2B9")
Warning in ggplot2::geom_segment(lineend = "butt", ...): Ignoring unknown
parameters: `main`
Code
plot3 <-ggAcf(diff(ridership_ts), 48, main="First Differenced Data") +theme_bw()+geom_segment(lineend ="butt", color ="#68A2B9") +geom_hline(yintercept =0, color ="#68A2B9")
Warning in ggplot2::geom_segment(lineend = "butt", ...): Ignoring unknown
parameters: `main`
Code
plot4 <-ggAcf(diff(diff(ridership_ts)), 48, main="Second Differenced Data") +theme_bw()+geom_segment(lineend ="butt", color ="#68A2B9") +geom_hline(yintercept =0, color ="#68A2B9")
Warning in ggplot2::geom_segment(lineend = "butt", ...): Ignoring unknown
parameters: `main`
Code
grid.arrange(plot1, plot2, plot3, plot4, nrow=4)
Code
fit_mta <-lm(na.remove(mta_ts)~time(na.remove(mta_ts)), na.action=NULL)plot1 <-ggAcf(mta_ts, 48, main="Original Data") +theme_bw()+geom_segment(lineend ="butt", color ="#68A2B9") +geom_hline(yintercept =0, color ="#68A2B9")
Warning in ggplot2::geom_segment(lineend = "butt", ...): Ignoring unknown
parameters: `main`
Code
plot2 <-ggAcf(resid(fit_mta), 48, main="Detrended Data") +theme_bw()+geom_segment(lineend ="butt", color ="#68A2B9") +geom_hline(yintercept =0, color ="#68A2B9")
Warning in ggplot2::geom_segment(lineend = "butt", ...): Ignoring unknown
parameters: `main`
Code
plot3 <-ggAcf(diff(mta_ts), 48, main="First Differenced Data") +theme_bw()+geom_segment(lineend ="butt", color ="#68A2B9") +geom_hline(yintercept =0, color ="#68A2B9")
Warning in ggplot2::geom_segment(lineend = "butt", ...): Ignoring unknown
parameters: `main`
Code
plot4 <-ggAcf(diff(diff(mta_ts)), 48, main="Second Differenced Data") +theme_bw()+geom_segment(lineend ="butt", color ="#68A2B9") +geom_hline(yintercept =0, color ="#68A2B9")
Warning in ggplot2::geom_segment(lineend = "butt", ...): Ignoring unknown
parameters: `main`
Code
grid.arrange(plot1, plot2, plot3, plot4, nrow=4)
Code
fit_cta <-lm(na.remove(cta_ts)~time(na.remove(cta_ts)), na.action=NULL)plot1 <-ggAcf(cta_ts, 48, main="Original Data") +theme_bw()+geom_segment(lineend ="butt", color ="#68A2B9") +geom_hline(yintercept =0, color ="#68A2B9")
Warning in ggplot2::geom_segment(lineend = "butt", ...): Ignoring unknown
parameters: `main`
Code
plot2 <-ggAcf(resid(fit_cta), 48, main="Detrended Data") +theme_bw()+geom_segment(lineend ="butt", color ="#68A2B9") +geom_hline(yintercept =0, color ="#68A2B9")
Warning in ggplot2::geom_segment(lineend = "butt", ...): Ignoring unknown
parameters: `main`
Code
plot3 <-ggAcf(diff(cta_ts), 48, main="First Differenced Data") +theme_bw()+geom_segment(lineend ="butt", color ="#68A2B9") +geom_hline(yintercept =0, color ="#68A2B9")
Warning in ggplot2::geom_segment(lineend = "butt", ...): Ignoring unknown
parameters: `main`
Code
plot4 <-ggAcf(diff(diff(cta_ts)), 48, main="Second Differenced Data") +theme_bw()+geom_segment(lineend ="butt", color ="#68A2B9") +geom_hline(yintercept =0, color ="#68A2B9")
Warning in ggplot2::geom_segment(lineend = "butt", ...): Ignoring unknown
parameters: `main`
Code
grid.arrange(plot1, plot2, plot3, plot4, nrow=4)
Code
fit_cars <-lm(na.remove(cars_ts)~time(na.remove(cars_ts)), na.action=NULL)plot1 <-ggAcf(cars_ts, 48, main="Original Data") +theme_bw()+geom_segment(lineend ="butt", color ="#68A2B9") +geom_hline(yintercept =0, color ="#68A2B9")
Warning in ggplot2::geom_segment(lineend = "butt", ...): Ignoring unknown
parameters: `main`
Code
plot2 <-ggAcf(resid(fit_cars), 48, main="Detrended Data") +theme_bw()+geom_segment(lineend ="butt", color ="#68A2B9") +geom_hline(yintercept =0, color ="#68A2B9")
Warning in ggplot2::geom_segment(lineend = "butt", ...): Ignoring unknown
parameters: `main`
Code
plot3 <-ggAcf(diff(cars_ts), 48, main="First Differenced Data") +theme_bw()+geom_segment(lineend ="butt", color ="#68A2B9") +geom_hline(yintercept =0, color ="#68A2B9")
Warning in ggplot2::geom_segment(lineend = "butt", ...): Ignoring unknown
parameters: `main`
Code
plot4 <-ggAcf(diff(diff(cars_ts)), 48, main="Second Differenced Data") +theme_bw()+geom_segment(lineend ="butt", color ="#68A2B9") +geom_hline(yintercept =0, color ="#68A2B9")
Warning in ggplot2::geom_segment(lineend = "butt", ...): Ignoring unknown
parameters: `main`
Code
grid.arrange(plot1, plot2, plot3, plot4, nrow=4)
Code
fit_gas <-lm(na.remove(gas_ts)~time(na.remove(gas_ts)), na.action=NULL)plot1 <-ggAcf(gas_ts, 48, main="Original Data") +theme_bw()+geom_segment(lineend ="butt", color ="#68A2B9") +geom_hline(yintercept =0, color ="#68A2B9")
Warning in ggplot2::geom_segment(lineend = "butt", ...): Ignoring unknown
parameters: `main`
Code
plot2 <-ggAcf(resid(fit_gas), 48, main="Detrended Data") +theme_bw()+geom_segment(lineend ="butt", color ="#68A2B9") +geom_hline(yintercept =0, color ="#68A2B9")
Warning in ggplot2::geom_segment(lineend = "butt", ...): Ignoring unknown
parameters: `main`
Code
plot3 <-ggAcf(diff(gas_ts), 48, main="First Differenced Data") +theme_bw()+geom_segment(lineend ="butt", color ="#68A2B9") +geom_hline(yintercept =0, color ="#68A2B9")
Warning in ggplot2::geom_segment(lineend = "butt", ...): Ignoring unknown
parameters: `main`
Code
plot4 <-ggAcf(diff(diff(gas_ts)), 48, main="Second Differenced Data") +theme_bw()+geom_segment(lineend ="butt", color ="#68A2B9") +geom_hline(yintercept =0, color ="#68A2B9")
Warning in ggplot2::geom_segment(lineend = "butt", ...): Ignoring unknown
parameters: `main`
Code
grid.arrange(plot1, plot2, plot3, plot4, nrow=4)
Code
fit_unemp <-lm(na.remove(unemp_ts)~time(na.remove(unemp_ts)), na.action=NULL)plot1 <-ggAcf(unemp_ts, 48, main="Original Data") +theme_bw()+geom_segment(lineend ="butt", color ="#68A2B9") +geom_hline(yintercept =0, color ="#68A2B9")
Warning in ggplot2::geom_segment(lineend = "butt", ...): Ignoring unknown
parameters: `main`
Code
plot2 <-ggAcf(resid(fit_unemp), 48, main="Detrended Data") +theme_bw()+geom_segment(lineend ="butt", color ="#68A2B9") +geom_hline(yintercept =0, color ="#68A2B9")
Warning in ggplot2::geom_segment(lineend = "butt", ...): Ignoring unknown
parameters: `main`
Code
plot3 <-ggAcf(diff(unemp_ts), 48, main="First Differenced Data") +theme_bw()+geom_segment(lineend ="butt", color ="#68A2B9") +geom_hline(yintercept =0, color ="#68A2B9")
Warning in ggplot2::geom_segment(lineend = "butt", ...): Ignoring unknown
parameters: `main`
Code
plot4 <-ggAcf(diff(diff(unemp_ts)), 48, main="Second Differenced Data") +theme_bw()+geom_segment(lineend ="butt", color ="#68A2B9") +geom_hline(yintercept =0, color ="#68A2B9")
Warning in ggplot2::geom_segment(lineend = "butt", ...): Ignoring unknown
parameters: `main`
Code
grid.arrange(plot1, plot2, plot3, plot4, nrow=4)
Uber
Code
fit_uber <-lm(na.remove(uber_ts)~time(na.remove(uber_ts)), na.action=NULL)plot1 <-ggAcf(uber_ts, 48, main="Original Data") +theme_bw()+geom_segment(lineend ="butt", color ="#68A2B9") +geom_hline(yintercept =0, color ="#68A2B9")
Warning in ggplot2::geom_segment(lineend = "butt", ...): Ignoring unknown
parameters: `main`
Code
plot2 <-ggAcf(resid(fit_uber), 48, main="Detrended Data") +theme_bw()+geom_segment(lineend ="butt", color ="#68A2B9") +geom_hline(yintercept =0, color ="#68A2B9")
Warning in ggplot2::geom_segment(lineend = "butt", ...): Ignoring unknown
parameters: `main`
Code
plot3 <-ggAcf(diff(uber_ts), 48, main="First Differenced Data") +theme_bw()+geom_segment(lineend ="butt", color ="#68A2B9") +geom_hline(yintercept =0, color ="#68A2B9")
Warning in ggplot2::geom_segment(lineend = "butt", ...): Ignoring unknown
parameters: `main`
Code
plot4 <-ggAcf(diff(diff(uber_ts)), 48, main="Second Differenced Data") +theme_bw()+geom_segment(lineend ="butt", color ="#68A2B9") +geom_hline(yintercept =0, color ="#68A2B9")
Warning in ggplot2::geom_segment(lineend = "butt", ...): Ignoring unknown
parameters: `main`
Code
grid.arrange(plot1, plot2, plot3, plot4, nrow=4)
Zoom
Code
fit_zoom <-lm(na.remove(zoom_ts)~time(na.remove(zoom_ts)), na.action=NULL)plot1 <-ggAcf(zoom_ts, 48, main="Original Data") +theme_bw()+geom_segment(lineend ="butt", color ="#68A2B9") +geom_hline(yintercept =0, color ="#68A2B9")
Warning in ggplot2::geom_segment(lineend = "butt", ...): Ignoring unknown
parameters: `main`
Code
plot2 <-ggAcf(resid(fit_zoom), 48, main="Detrended Data") +theme_bw()+geom_segment(lineend ="butt", color ="#68A2B9") +geom_hline(yintercept =0, color ="#68A2B9")
Warning in ggplot2::geom_segment(lineend = "butt", ...): Ignoring unknown
parameters: `main`
Code
plot3 <-ggAcf(diff(zoom_ts), 48, main="First Differenced Data") +theme_bw()+geom_segment(lineend ="butt", color ="#68A2B9") +geom_hline(yintercept =0, color ="#68A2B9")
Warning in ggplot2::geom_segment(lineend = "butt", ...): Ignoring unknown
parameters: `main`
Code
plot4 <-ggAcf(diff(diff(zoom_ts)), 48, main="Second Differenced Data") +theme_bw()+geom_segment(lineend ="butt", color ="#68A2B9") +geom_hline(yintercept =0, color ="#68A2B9")
Warning in ggplot2::geom_segment(lineend = "butt", ...): Ignoring unknown
parameters: `main`
Code
grid.arrange(plot1, plot2, plot3, plot4, nrow=4)
Exxon
Code
fit_exxon <-lm(na.remove(exxon_ts)~time(na.remove(exxon_ts)), na.action=NULL)plot1 <-ggAcf(exxon_ts, 48, main="Original Data") +theme_bw()+geom_segment(lineend ="butt", color ="#68A2B9") +geom_hline(yintercept =0, color ="#68A2B9")
Warning in ggplot2::geom_segment(lineend = "butt", ...): Ignoring unknown
parameters: `main`
Code
plot2 <-ggAcf(resid(fit_exxon), 48, main="Detrended Data") +theme_bw()+geom_segment(lineend ="butt", color ="#68A2B9") +geom_hline(yintercept =0, color ="#68A2B9")
Warning in ggplot2::geom_segment(lineend = "butt", ...): Ignoring unknown
parameters: `main`
Code
plot3 <-ggAcf(diff(exxon_ts), 48, main="First Differenced Data") +theme_bw()+geom_segment(lineend ="butt", color ="#68A2B9") +geom_hline(yintercept =0, color ="#68A2B9")
Warning in ggplot2::geom_segment(lineend = "butt", ...): Ignoring unknown
parameters: `main`
Code
plot4 <-ggAcf(diff(diff(exxon_ts)), 48, main="Second Differenced Data") +theme_bw()+geom_segment(lineend ="butt", color ="#68A2B9") +geom_hline(yintercept =0, color ="#68A2B9")
Warning in ggplot2::geom_segment(lineend = "butt", ...): Ignoring unknown
parameters: `main`
Code
grid.arrange(plot1, plot2, plot3, plot4, nrow=4)
In each of these cases, it is clear that first-order differencing is both necessary and sufficient. To avoid over-differencing the data, we will choose \(d=1\) for all time series data. The next section will address other parameters.
Deciding Parameters
Based on the ACF and PACF plots of the original, detrended, and differenced data at multiple orders of difference, below we will outline chosen parameters \(p\), \(d\), and \(q\) for each time series.
plot1 <-ggAcf(diff(ridership_ts), 48, main="First Differenced Data - ACF") +theme_bw()+geom_segment(lineend ="butt", color ="#68A2B9") +geom_hline(yintercept =0, color ="#68A2B9")plot2 <-ggPacf(diff(ridership_ts), 48, main="First Differenced Data - PACF") +theme_bw()+geom_segment(lineend ="butt", color ="#68A2B9") +geom_hline(yintercept =0, color ="#68A2B9")grid.arrange(plot1, plot2, nrow=2)
Based on the low ACF values, it seems like an ARIMA(0,1,0) model would suffice here.
Code
plot1 <-ggAcf(diff(mta_ts), 48, main="First Differenced Data - ACF") +theme_bw()+geom_segment(lineend ="butt", color ="#68A2B9") +geom_hline(yintercept =0, color ="#68A2B9")plot2 <-ggPacf(diff(mta_ts), 48, main="First Differenced Data - PACF") +theme_bw()+geom_segment(lineend ="butt", color ="#68A2B9") +geom_hline(yintercept =0, color ="#68A2B9")grid.arrange(plot1, plot2, nrow=2)
These plots indicate that an ARIMA(7,1,2) model would be best.
Code
plot1 <-ggAcf(diff(cta_ts), 48, main="First Differenced Data - ACF") +theme_bw()+geom_segment(lineend ="butt", color ="#68A2B9") +geom_hline(yintercept =0, color ="#68A2B9")plot2 <-ggPacf(diff(cta_ts), 48, main="First Differenced Data - PACF") +theme_bw()+geom_segment(lineend ="butt", color ="#68A2B9") +geom_hline(yintercept =0, color ="#68A2B9")grid.arrange(plot1, plot2, nrow=2)
These plots indicate that an ARIMA(7,1,2) model would be best.
Code
plot1 <-ggAcf(diff(cars_ts), 48, main="First Differenced Data - ACF") +theme_bw()+geom_segment(lineend ="butt", color ="#68A2B9") +geom_hline(yintercept =0, color ="#68A2B9")plot2 <-ggPacf(diff(cars_ts), 48, main="First Differenced Data - PACF") +theme_bw()+geom_segment(lineend ="butt", color ="#68A2B9") +geom_hline(yintercept =0, color ="#68A2B9")grid.arrange(plot1, plot2, nrow=2)
Due to the clear seasonal component, these plots indicate that a SARIMA model with seasonal period equal to 12 would be best. The other parameters are difficult to capture here, so we will use auto.arima() for that.
Code
plot1 <-ggAcf(diff(gas_ts), 48, main="First Differenced Data - ACF") +theme_bw()+geom_segment(lineend ="butt", color ="#68A2B9") +geom_hline(yintercept =0, color ="#68A2B9")plot2 <-ggPacf(diff(gas_ts), 48, main="First Differenced Data - PACF") +theme_bw()+geom_segment(lineend ="butt", color ="#68A2B9") +geom_hline(yintercept =0, color ="#68A2B9")grid.arrange(plot1, plot2, nrow=2)
Due to the clear seasonal component, these plots indicate that a SARIMA model with seasonal period equal to 12 would be best. The other parameters are difficult to capture here, so we will use auto.arima() for that.
Code
plot1 <-ggAcf(diff(unemp_ts), 48, main="First Differenced Data - ACF") +theme_bw()+geom_segment(lineend ="butt", color ="#68A2B9") +geom_hline(yintercept =0, color ="#68A2B9")plot2 <-ggPacf(diff(unemp_ts), 48, main="First Differenced Data - PACF") +theme_bw()+geom_segment(lineend ="butt", color ="#68A2B9") +geom_hline(yintercept =0, color ="#68A2B9")grid.arrange(plot1, plot2, nrow=2)
These plots indicate that an ARIMA(2,1,2) model would be best
Uber
Code
plot1 <-ggAcf(diff(uber_ts), 48, main="First Differenced Data - ACF") +theme_bw()+geom_segment(lineend ="butt", color ="#68A2B9") +geom_hline(yintercept =0, color ="#68A2B9")plot2 <-ggPacf(diff(uber_ts), 48, main="First Differenced Data - PACF") +theme_bw()+geom_segment(lineend ="butt", color ="#68A2B9") +geom_hline(yintercept =0, color ="#68A2B9")grid.arrange(plot1, plot2, nrow=2)
Zoom
Code
plot1 <-ggAcf(diff(zoom_ts), 48, main="First Differenced Data - ACF") +theme_bw()+geom_segment(lineend ="butt", color ="#68A2B9") +geom_hline(yintercept =0, color ="#68A2B9")plot2 <-ggPacf(diff(zoom_ts), 48, main="First Differenced Data - PACF") +theme_bw()+geom_segment(lineend ="butt", color ="#68A2B9") +geom_hline(yintercept =0, color ="#68A2B9")grid.arrange(plot1, plot2, nrow=2)
Exxon
Code
plot1 <-ggAcf(diff(exxon_ts), 48, main="First Differenced Data - ACF") +theme_bw()+geom_segment(lineend ="butt", color ="#68A2B9") +geom_hline(yintercept =0, color ="#68A2B9")plot2 <-ggPacf(diff(exxon_ts), 48, main="First Differenced Data - PACF") +theme_bw()+geom_segment(lineend ="butt", color ="#68A2B9") +geom_hline(yintercept =0, color ="#68A2B9")grid.arrange(plot1, plot2, nrow=2)
For each financial time series, it appears that ARIMA(0,1,0) is the most appropriate model.
Using auto.arima()
While it can be useful to select model parameters based on exploratory data analysis from ACF and PACF plots, there is value in finding the best model via computational methods. In this section, we will apply the auto.arima() function to each time series to get a second option (or validate our first option) for fitting a model.
In this section we will simply fit a range of models using the parameters selected above (via either the parameters selected via plotting or the auto.arima() method) to determine the ranges of each parameters. These will be fit using the Arima() function in R and include all necessary components including order and seasonality. Thus, time series that don’t require seasonal analysis will be simple ARIMA models, while those that do will be SARIMA models. The best model will be chosen based on a combination of AIC and BIC scores.
Below we can see model diagnostics such as AIC and BIC for each final univariate model. These will be useful when comparing to multivariate, financial, and deep learning models in order to evaluate which methods are best at matching time series data for this project.
Now that our models have been fit and diagnostics have been recorded, we can get a better sense of what these models actually predict by forecasting several time steps ahead. This will better contextualize what the models actually learn, and provide valuable insight towards what we can expect in the future.
ridership_pred <-forecast(ridership_fit, h =20)accuracy(ridership_pred)
ME RMSE MAE MPE MAPE MASE ACF1
Training set -4682.87 156042.4 54633.29 -1.286491 4.18834 0.4239048 -0.0271934
Code
autoplot(ridership_pred) +labs(title ="ARIMA(1,1,2)(1,0,1)[4] Forecast for National Ridership",x ="Time",y ="Predicted Values") +theme_minimal()
This forecast indicates that that ridership is expected to maintain its seasonal pattern while trending slightly down starting in the next time step. As we can see from the confidence bands, this is not a very assured prediction. It is very likely that the variance caused in 2020, which we saw was not totally accounted for in the differencing step, is still causing confusion. This prediction is quite counter-intuitive seeing the gradual rise we’ve observed since the COVID-19 dip.
Code
mta_pred <-forecast(mta_fit, 365)autoplot(mta_pred) +labs(title ="ARIMA(7,1,2) Forecast for New York Ridership",x ="Time",y ="Predicted Values") +theme_minimal()
This prediction appears to indicate that ridership in New York will continue to gradually rise, while losing variance as time goes on. Once again, these are not very confident predictions. It is important to note that we observed both weekly and yearly seasonality in this data. ARIMA models notably struggle when dealing with concurrent seasonal periods, so this could be contributing to an inability to produce a coherent forecast.
Code
cta_pred <-forecast(cta_fit, 365)autoplot(cta_pred) +labs(title ="ARIMA(7,1,2) Forecast for Chicago Ridership",x ="Time",y ="Predicted Values") +theme_minimal()
This prediction appears to indicate that ridership in Chicago will gradually fall, while losing variance as time goes on. Once again, these are not very confident predictions. It is important to note that we observed both weekly and yearly seasonality in this data. ARIMA models notably struggle when dealing with concurrent seasonal periods, so this could be contributing to an inability to produce a coherent forecast.
Code
ridership_pred <-forecast(ridership_fit, h =20)accuracy(ridership_pred)
ME RMSE MAE MPE MAPE MASE ACF1
Training set -4682.87 156042.4 54633.29 -1.286491 4.18834 0.4239048 -0.0271934
This forecast does not tell us much, as our model appears to have struggled to create a coherent prediction. Ultimately, the forecast predicts a rather constant value with very wide confidence bands.
Code
gas_pred <-forecast(gas_fit, 36)autoplot(gas_pred) +labs(title ="ARIMA(3,1,1)(2,0,0)[12] Forecast for Gas Prices",x ="Time",y ="Predicted Values") +theme_minimal()
This forecast does not tell us much, as our model appears to have struggled to create a coherent prediction. Ultimately, the forecast predicts a rather constant value with very wide confidence bands.
This forecast does not tell us much, as our model appears to have struggled to create a coherent prediction. Ultimately, the forecast predicts a rather constant value with very wide confidence bands.
These forecasts do not tell us much, as our models appear to have struggled to create coherent predictions. Ultimately, the forecasts predict rather constant values with very wide confidence bands.
Comparison with Benchmark Methods
Finally, to truly understand the value of univariate time series analysis via fitting ARIMA/SARIMA models, it is important to compare with benchmark methods and see how they stack up. The following plots include the ARIMA forecasts alongside a mean forecast, a naïve forecast, and a drift forecast.
mean_ridership <-meanf(ridership_ts, h =20)naive_ridership <-naive(ridership_ts, h =20)drift_ridership <-rwf(ridership_ts, drift =TRUE, h =20)arima_ridership <- ridership_predmean_ridership_df <-data.frame(Date =time(mean_ridership$mean), Mean =as.numeric(mean_ridership$mean))naive_ridership_df <-data.frame(Date =time(naive_ridership$mean), Naive =as.numeric(naive_ridership$mean))drift_ridership_df <-data.frame(Date =time(drift_ridership$mean), Drift =as.numeric(drift_ridership$mean))arima_ridership_df <-data.frame(Date =time(arima_ridership$mean), ARIMA_Fit =as.numeric(arima_ridership$mean))ridership_ts_df <-data.frame(Date =time(ridership_ts), Ridership =as.numeric(ridership_ts))plot_ly() %>%add_lines(data = ridership_ts_df, x =~Date, y =~Ridership, name ='Original Data', line =list(color ='black')) %>%add_lines(data = mean_ridership_df, x =~Date, y =~Mean, name ='Mean Forecast', line =list(color ='blue')) %>%add_lines(data = naive_ridership_df, x =~Date, y =~Naive, name ='Naïve Forecast', line =list(color ='red')) %>%add_lines(data = drift_ridership_df, x =~Date, y =~Drift, name ='Drift Forecast', line =list(color ='green')) %>%add_lines(data = arima_ridership_df, x =~Date, y =~ARIMA_Fit, name ='ARIMA Fit', line =list(color ='purple')) %>%layout(title ='Ridership Forecast',xaxis =list(title ='Date'),yaxis =list(title ='Number of Rides (000s)'),legend =list(title =list(text ='Forecast Methods')))
Code
mean_mta <-meanf(mta_ts, h =365)naive_mta <-naive(mta_ts, h =365)drift_mta <-rwf(mta_ts, drift =TRUE, h =365)arima_mta <- mta_predmean_mta_df <-data.frame(Date =time(mean_mta$mean), Mean =as.numeric(mean_mta$mean))naive_mta_df <-data.frame(Date =time(naive_mta$mean), Naive =as.numeric(naive_mta$mean))drift_mta_df <-data.frame(Date =time(drift_mta$mean), Drift =as.numeric(drift_mta$mean))arima_mta_df <-data.frame(Date =time(arima_mta$mean), ARIMA_Fit =as.numeric(arima_mta$mean))mta_ts_df <-data.frame(Date =time(mta_ts), Ridership =as.numeric(mta_ts))plot_ly() %>%add_lines(data = mta_ts_df, x =~Date, y =~Ridership, name ='Original Data', line =list(color ='black')) %>%add_lines(data = mean_mta_df, x =~Date, y =~Mean, name ='Mean Forecast', line =list(color ='blue')) %>%add_lines(data = naive_mta_df, x =~Date, y =~Naive, name ='Naïve Forecast', line =list(color ='red')) %>%add_lines(data = drift_mta_df, x =~Date, y =~Drift, name ='Drift Forecast', line =list(color ='green')) %>%add_lines(data = arima_mta_df, x =~Date, y =~ARIMA_Fit, name ='ARIMA Fit', line =list(color ='purple')) %>%layout(title ='MTA Ridership Forecast',xaxis =list(title ='Date'),yaxis =list(title ='Number of Rides'),legend =list(title =list(text ='Forecast Methods')))
Code
mean_cta <-meanf(cta_ts, h =365)naive_cta <-naive(cta_ts, h =365)drift_cta <-rwf(cta_ts, drift =TRUE, h =365)arima_cta <- cta_predmean_cta_df <-data.frame(Date =time(mean_cta$mean), Mean =as.numeric(mean_cta$mean))naive_cta_df <-data.frame(Date =time(naive_cta$mean), Naive =as.numeric(naive_cta$mean))drift_cta_df <-data.frame(Date =time(drift_cta$mean), Drift =as.numeric(drift_cta$mean))arima_cta_df <-data.frame(Date =time(arima_cta$mean), ARIMA_Fit =as.numeric(arima_cta$mean))cta_ts_df <-data.frame(Date =time(cta_ts), Ridership =as.numeric(cta_ts))plot_ly() %>%add_lines(data = cta_ts_df, x =~Date, y =~Ridership, name ='Original Data', line =list(color ='black')) %>%add_lines(data = mean_cta_df, x =~Date, y =~Mean, name ='Mean Forecast', line =list(color ='blue')) %>%add_lines(data = naive_cta_df, x =~Date, y =~Naive, name ='Naïve Forecast', line =list(color ='red')) %>%add_lines(data = drift_cta_df, x =~Date, y =~Drift, name ='Drift Forecast', line =list(color ='green')) %>%add_lines(data = arima_cta_df, x =~Date, y =~ARIMA_Fit, name ='ARIMA Fit', line =list(color ='purple')) %>%layout(title ='CTA Ridership Forecast',xaxis =list(title ='Date'),yaxis =list(title ='Number of Rides'),legend =list(title =list(text ='Forecast Methods')))
Code
mean_cars <-meanf(cars_ts, h =36)naive_cars <-naive(cars_ts, h =36)drift_cars <-rwf(cars_ts, drift =TRUE, h =36)arima_cars <- cars_predmean_cars_df <-data.frame(Date =time(mean_cars$mean), Mean =as.numeric(mean_cars$mean))naive_cars_df <-data.frame(Date =time(naive_cars$mean), Naive =as.numeric(naive_cars$mean))drift_cars_df <-data.frame(Date =time(drift_cars$mean), Drift =as.numeric(drift_cars$mean))arima_cars_df <-data.frame(Date =time(arima_cars$mean), ARIMA_Fit =as.numeric(arima_cars$mean))cars_ts_df <-data.frame(Date =time(cars_ts), Miles =as.numeric(cars_ts))plot_ly() %>%add_lines(data = cars_ts_df, x =~Date, y =~Miles, name ='Original Data', line =list(color ='black')) %>%add_lines(data = mean_cars_df, x =~Date, y =~Mean, name ='Mean Forecast', line =list(color ='blue')) %>%add_lines(data = naive_cars_df, x =~Date, y =~Naive, name ='Naïve Forecast', line =list(color ='red')) %>%add_lines(data = drift_cars_df, x =~Date, y =~Drift, name ='Drift Forecast', line =list(color ='green')) %>%add_lines(data = arima_cars_df, x =~Date, y =~ARIMA_Fit, name ='ARIMA Fit', line =list(color ='purple')) %>%layout(title ='Vehicle Miles Forecast',xaxis =list(title ='Date'),yaxis =list(title ='Million Vehicle Miles'),legend =list(title =list(text ='Forecast Methods')))
Code
mean_gas <-meanf(gas_ts, h =36)naive_gas <-naive(gas_ts, h =36)drift_gas <-rwf(gas_ts, drift =TRUE, h =36)arima_gas <- gas_predmean_gas_df <-data.frame(Date =time(mean_gas$mean), Mean =as.numeric(mean_gas$mean))naive_gas_df <-data.frame(Date =time(naive_gas$mean), Naive =as.numeric(naive_gas$mean))drift_gas_df <-data.frame(Date =time(drift_gas$mean), Drift =as.numeric(drift_gas$mean))arima_gas_df <-data.frame(Date =time(arima_gas$mean), ARIMA_Fit =as.numeric(arima_gas$mean))gas_ts_df <-data.frame(Date =time(gas_ts), Prices =as.numeric(gas_ts))plot_ly() %>%add_lines(data = gas_ts_df, x =~Date, y =~Prices, name ='Original Data', line =list(color ='black')) %>%add_lines(data = mean_gas_df, x =~Date, y =~Mean, name ='Mean Forecast', line =list(color ='blue')) %>%add_lines(data = naive_gas_df, x =~Date, y =~Naive, name ='Naïve Forecast', line =list(color ='red')) %>%add_lines(data = drift_gas_df, x =~Date, y =~Drift, name ='Drift Forecast', line =list(color ='green')) %>%add_lines(data = arima_gas_df, x =~Date, y =~ARIMA_Fit, name ='ARIMA Fit', line =list(color ='purple')) %>%layout(title ='Gas Prices Forecast',xaxis =list(title ='Date'),yaxis =list(title ='Average Gas Price'),legend =list(title =list(text ='Forecast Methods')))
Code
mean_unemp <-meanf(unemp_ts, h =36)naive_unemp <-naive(unemp_ts, h =36)drift_unemp <-rwf(unemp_ts, drift =TRUE, h =36)arima_unemp <- unemp_predmean_unemp_df <-data.frame(Date =time(mean_unemp$mean), Mean =as.numeric(mean_unemp$mean))naive_unemp_df <-data.frame(Date =time(naive_unemp$mean), Naive =as.numeric(naive_unemp$mean))drift_unemp_df <-data.frame(Date =time(drift_unemp$mean), Drift =as.numeric(drift_unemp$mean))arima_unemp_df <-data.frame(Date =time(arima_unemp$mean), ARIMA_Fit =as.numeric(arima_unemp$mean))unemp_ts_df <-data.frame(Date =time(unemp_ts), Rate =as.numeric(unemp_ts))plot_ly() %>%add_lines(data = unemp_ts_df, x =~Date, y =~Rate, name ='Original Data', line =list(color ='black')) %>%add_lines(data = mean_unemp_df, x =~Date, y =~Mean, name ='Mean Forecast', line =list(color ='blue')) %>%add_lines(data = naive_unemp_df, x =~Date, y =~Naive, name ='Naïve Forecast', line =list(color ='red')) %>%add_lines(data = drift_unemp_df, x =~Date, y =~Drift, name ='Drift Forecast', line =list(color ='green')) %>%add_lines(data = arima_unemp_df, x =~Date, y =~ARIMA_Fit, name ='ARIMA Fit', line =list(color ='purple')) %>%layout(title ='Unemployment Rate Forecast',xaxis =list(title ='Date'),yaxis =list(title ='Unemployment Rate'),legend =list(title =list(text ='Forecast Methods')))
Uber
Code
mean_uber <-meanf(uber_ts, h =252)naive_uber <-naive(uber_ts, h =252)drift_uber <-rwf(uber_ts, drift =TRUE, h =252)arima_uber <- uber_predmean_uber_df <-data.frame(Date =time(mean_uber$mean), Mean =as.numeric(mean_uber$mean))naive_uber_df <-data.frame(Date =time(naive_uber$mean), Naive =as.numeric(naive_uber$mean))drift_uber_df <-data.frame(Date =time(drift_uber$mean), Drift =as.numeric(drift_uber$mean))arima_uber_df <-data.frame(Date =time(arima_uber$mean), ARIMA_Fit =as.numeric(arima_uber$mean))uber_ts_df <-data.frame(Date =time(uber_ts), Stock =as.numeric(uber_ts))plot_ly() %>%add_lines(data = uber_ts_df, x =~Date, y =~Stock, name ='Original Data', line =list(color ='black')) %>%add_lines(data = mean_uber_df, x =~Date, y =~Mean, name ='Mean Forecast', line =list(color ='blue')) %>%add_lines(data = naive_uber_df, x =~Date, y =~Naive, name ='Naïve Forecast', line =list(color ='red')) %>%add_lines(data = drift_uber_df, x =~Date, y =~Drift, name ='Drift Forecast', line =list(color ='green')) %>%add_lines(data = arima_uber_df, x =~Date, y =~ARIMA_Fit, name ='ARIMA Fit', line =list(color ='purple')) %>%layout(title ='Uber Stock Price Forecast',xaxis =list(title ='Date'),yaxis =list(title ='Stock Price'),legend =list(title =list(text ='Forecast Methods')))
Zoom
Code
mean_zoom <-meanf(zoom_ts, h =252)naive_zoom <-naive(zoom_ts, h =252)drift_zoom <-rwf(zoom_ts, drift =TRUE, h =252)arima_zoom <- zoom_predmean_zoom_df <-data.frame(Date =time(mean_zoom$mean), Mean =as.numeric(mean_zoom$mean))naive_zoom_df <-data.frame(Date =time(naive_zoom$mean), Naive =as.numeric(naive_zoom$mean))drift_zoom_df <-data.frame(Date =time(drift_zoom$mean), Drift =as.numeric(drift_zoom$mean))arima_zoom_df <-data.frame(Date =time(arima_zoom$mean), ARIMA_Fit =as.numeric(arima_zoom$mean))zoom_ts_df <-data.frame(Date =time(zoom_ts), Stock =as.numeric(zoom_ts))plot_ly() %>%add_lines(data = zoom_ts_df, x =~Date, y =~Stock, name ='Original Data', line =list(color ='black')) %>%add_lines(data = mean_zoom_df, x =~Date, y =~Mean, name ='Mean Forecast', line =list(color ='blue')) %>%add_lines(data = naive_zoom_df, x =~Date, y =~Naive, name ='Naïve Forecast', line =list(color ='red')) %>%add_lines(data = drift_zoom_df, x =~Date, y =~Drift, name ='Drift Forecast', line =list(color ='green')) %>%add_lines(data = arima_zoom_df, x =~Date, y =~ARIMA_Fit, name ='ARIMA Fit', line =list(color ='purple')) %>%layout(title ='Zoom Stock Price Forecast',xaxis =list(title ='Date'),yaxis =list(title ='Stock Price'),legend =list(title =list(text ='Forecast Methods')))
Exxon
Code
mean_exxon <-meanf(exxon_ts, h =252)naive_exxon <-naive(exxon_ts, h =252)drift_exxon <-rwf(exxon_ts, drift =TRUE, h =252)arima_exxon <- exxon_predmean_exxon_df <-data.frame(Date =time(mean_exxon$mean), Mean =as.numeric(mean_exxon$mean))naive_exxon_df <-data.frame(Date =time(naive_exxon$mean), Naive =as.numeric(naive_exxon$mean))drift_exxon_df <-data.frame(Date =time(drift_exxon$mean), Drift =as.numeric(drift_exxon$mean))arima_exxon_df <-data.frame(Date =time(arima_exxon$mean), ARIMA_Fit =as.numeric(arima_exxon$mean))exxon_ts_df <-data.frame(Date =time(exxon_ts), Stock =as.numeric(exxon_ts))plot_ly() %>%add_lines(data = exxon_ts_df, x =~Date, y =~Stock, name ='Original Data', line =list(color ='black')) %>%add_lines(data = mean_exxon_df, x =~Date, y =~Mean, name ='Mean Forecast', line =list(color ='blue')) %>%add_lines(data = naive_exxon_df, x =~Date, y =~Naive, name ='Naïve Forecast', line =list(color ='red')) %>%add_lines(data = drift_exxon_df, x =~Date, y =~Drift, name ='Drift Forecast', line =list(color ='green')) %>%add_lines(data = arima_exxon_df, x =~Date, y =~ARIMA_Fit, name ='ARIMA Fit', line =list(color ='purple')) %>%layout(title ='Exxon Stock Price Forecast',xaxis =list(title ='Date'),yaxis =list(title ='Stock Price'),legend =list(title =list(text ='Forecast Methods')))
In all of these cases, the mean forecast is rather useless, as it begins from a value incongruent with the current time step. ARIMA fits tend to provide more information than benchmark methods when they have a seasonal component, but the reality is that we have not gotten particularly valuable insights from univariate time series analysis. Moving forward, it will be valuable to see how the presence of exogenous variables or more complex time series analysis methods contribute to improved forecasts.